loading data

For the DESeq2 analysis, the raw, prefiltered counts will be used.

se.gastroc <- readRDS("./data/Robjects/01_se.gastroc.rds")
se.soleus <- readRDS("./data/Robjects/01_se.soleus.rds")

counts.gastroc <- se.gastroc[rowData(se.gastroc)$filtered,] %>% assay()
counts.soleus <- se.soleus[rowData(se.soleus)$filtered,] %>% assay()
metadata <- colData(se.gastroc)

preparing DDS objects

# trying to reorder the metadata, so that the design is correct (so far no luck)
# metadata_2 <- as.data.frame(metadata)
# metadata_2$genotype <- factor(metadata_2$genotype, levels = c("KO", "WT"))

# metadata_2 <- rbind(metadata[7:12,,drop=FALSE], metadata[1:6,,drop=FALSE])

metadata$genotype <- as.factor(metadata$genotype) # for DESeq



createDDSObject <- function(counts, metadata) {
  # select sample columns
  reorder_index <- match(rownames(metadata), colnames(counts))
  counts <- counts[,reorder_index] %>% 
    apply(c(1,2), as.integer) # DESeq need normal counts (integer) as input
  
  # Check metadata consistency
  all(rownames(metadata) %in% colnames(counts)) %>%
    assertthat::assert_that(., msg = "metadata and count table do not match")
  
  ## DESeq2 object
  dds <- DESeqDataSetFromMatrix(countData = counts,
                                colData = metadata,
                                design = ~ genotype)
  
  return( DESeq(dds) )
}

# creating DESeq Objects
dds.gastroc <- createDDSObject(counts.gastroc, metadata)
dds.soleus <- createDDSObject(counts.soleus, metadata)

Results

res.gastroc <- results(dds.gastroc, alpha = params$pCutoff, contrast = c("genotype", "KO", "WT"))
res.soleus <- results(dds.soleus, alpha = params$pCutoff, contrast = c("genotype", "KO", "WT"))
# lfcShrink is not applied by default

# library(ashr) # using instead of "apeglm", because https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators

# resultsNames(dds.gastroc)
resLFC.gastroc <- lfcShrink(dds.gastroc, contrast = c("genotype", "KO", "WT"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
resLFC.soleus <- lfcShrink(dds.soleus, contrast = c("genotype", "KO", "WT"), type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041

count outlier detection with cooks distance

Filtering outliers using the cooks distance is done automatically and p-values are set to NA. Changing (lowering) the minReplicatesForReplace will replace these flagged values.

DESeq2 flags these outliers and removes the corresponding genes completely, if below 7 samples (above would replace the values)

# assuming all NA p-values are the ones filtered out using the cooks distance

print_outliers <- function(res, se){
  # all genes with a `NA` value are exactly those 5 which were also detected by the cooks distance:
  outlier_ids <- res %>% 
    as.data.frame() %>% 
    filter(is.na(pvalue)) %>% 
    rownames()
  
  # a different approach would be finding the genes which have a cooks distance
  # above the cutoff:
  # cooksCutoff <- qf(.99, 2,10)
  # outlier_ids <- which(mcols(dds)$maxCooks > cooksCutoff) %>% names()
  
  # getting counts and metadata
  outlier_metadata.df <- rowData(se) %>%
    as.data.frame() %>%
    select(gene_name, gene_biotype)
  
  counts.df <- assay(se)
  
  # merging df
  outliers.df <-
    merge(outlier_metadata.df, counts.df[outlier_ids,], by = 0)
  return(outliers.df)
}

outliers_gastroc.df <- print_outliers(res.gastroc, se.gastroc) %>% tibble::column_to_rownames(var = "Row.names")
write.csv(outliers_gastroc.df, file = "./code/analysis/03_outliers_gastroc.csv")
outliers_gastroc.df %>% knitr::kable()
print_outliers(res.soleus, se.soleus) %>% knitr::kable()
gene_name gene_biotype WT_1 WT_3 WT_5 WT_9 WT_11 WT_15 KO_2 KO_4 KO_6 KO_8 KO_10 KO_16
ENSMUSG00000027375 Mal protein_coding 496 82 87 88 118 71 112 80 70 99 164 200
ENSMUSG00000039579 Grin3a protein_coding 4 1 5 3 0 3 12 5 11 2 9 142
ENSMUSG00000041607 Mbp protein_coding 3390 462 524 654 836 536 849 493 456 606 1305 1094
ENSMUSG00000053198 Prx protein_coding 1190 159 236 327 346 200 313 196 197 237 428 372
ENSMUSG00000056569 Mpz protein_coding 7949 1168 1229 1562 1657 996 1792 1181 877 1447 2681 2691
Row.names gene_name gene_biotype WT_1 WT_3 WT_5 WT_9 WT_11 WT_15 KO_2 KO_4 KO_6 KO_8 KO_10 KO_16
ENSMUSG00000000290 Itgb2 protein_coding 22 47 45 50 72 18 38 457 44 58 66 63
ENSMUSG00000000682 Cd52 protein_coding 8 34 13 18 19 24 10 204 23 15 21 22
ENSMUSG00000000982 Ccl3 protein_coding 0 0 0 3 1 0 1 25 0 0 3 0
ENSMUSG00000001131 Timp1 protein_coding 21 39 25 32 30 38 94 1080 85 33 81 68
ENSMUSG00000002985 Apoe protein_coding 762 1049 1260 1220 1418 1005 1417 10248 1592 1330 1474 1369
ENSMUSG00000003484 Cyp4f18 protein_coding 2 7 3 8 9 3 17 166 10 15 12 4
ENSMUSG00000003882 Il7r protein_coding 3 3 1 7 2 0 2 129 3 5 6 6
ENSMUSG00000004707 Ly9 protein_coding 0 0 4 5 8 0 4 122 5 2 0 1
ENSMUSG00000005087 Cd44 protein_coding 183 229 200 236 240 190 324 1750 335 262 414 265
ENSMUSG00000006519 Cyba protein_coding 135 138 144 132 124 165 177 732 128 166 214 129
ENSMUSG00000015947 Fcgr1 protein_coding 6 5 12 6 10 3 7 143 15 4 10 6
ENSMUSG00000015950 Ncf1 protein_coding 22 51 59 45 71 18 67 497 76 36 75 55
ENSMUSG00000017716 Birc5 protein_coding 7 21 7 8 15 16 16 227 18 19 18 21
ENSMUSG00000018927 Ccl6 protein_coding 83 164 172 135 175 95 206 1248 182 114 204 109
ENSMUSG00000019876 Pkib protein_coding 2 2 6 2 4 4 4 92 3 3 1 4
ENSMUSG00000019987 Arg1 protein_coding 5 6 2 13 6 1 8 108 6 17 10 9
ENSMUSG00000020120 Plek protein_coding 17 39 30 32 48 22 36 407 44 32 21 29
ENSMUSG00000020865 Abcc3 protein_coding 14 13 22 16 19 11 17 184 23 19 23 13
ENSMUSG00000021091 Serpina3n protein_coding 19 22 29 22 22 22 65 404 42 31 49 57
ENSMUSG00000021175 Cdca7l protein_coding 0 3 2 2 0 0 0 34 2 2 0 2
ENSMUSG00000021886 Gpr65 protein_coding 0 2 5 4 5 5 5 56 5 4 6 3
ENSMUSG00000021998 Lcp1 protein_coding 95 87 97 126 124 73 134 1457 185 129 187 73
ENSMUSG00000022534 Mefv protein_coding 1 9 1 1 2 0 0 33 2 1 0 2
ENSMUSG00000024300 Myo1f protein_coding 19 34 19 37 26 22 25 349 42 32 23 25
ENSMUSG00000024397 Aif1 protein_coding 9 6 7 4 9 10 5 128 8 9 15 10
ENSMUSG00000024529 Lox protein_coding 74 190 110 93 154 108 201 820 161 129 151 152
ENSMUSG00000024675 Ms4a4c protein_coding 0 0 5 0 2 0 3 26 0 0 3 0
ENSMUSG00000024677 Ms4a6b protein_coding 13 32 27 24 42 16 21 179 23 18 23 19
ENSMUSG00000024679 Ms4a6d protein_coding 9 17 12 15 21 13 12 305 22 20 15 6
ENSMUSG00000025044 Msr1 protein_coding 10 7 12 15 22 2 2 358 23 14 15 17
ENSMUSG00000025473 Adam8 protein_coding 30 57 44 53 39 38 49 525 45 38 52 72
ENSMUSG00000025877 Hk3 protein_coding 9 10 7 3 9 2 11 165 9 16 10 10
ENSMUSG00000026358 Rgs1 protein_coding 3 4 0 0 1 0 0 88 0 2 0 1
ENSMUSG00000029304 Spp1 protein_coding 77 47 68 34 78 113 97 4879 96 61 63 62
ENSMUSG00000029373 Pf4 protein_coding 56 69 105 115 111 45 57 846 99 83 75 50
ENSMUSG00000029915 Clec5a protein_coding 8 10 1 7 4 8 7 86 11 6 10 6
ENSMUSG00000030117 Gdf3 protein_coding 0 0 0 0 0 0 0 129 10 1 3 1
ENSMUSG00000030144 Clec4d protein_coding 0 0 0 2 1 4 3 110 4 0 5 3
ENSMUSG00000030365 Clec2i protein_coding 3 3 2 8 2 5 0 44 2 3 4 0
ENSMUSG00000030579 Tyrobp protein_coding 19 25 45 39 25 18 24 508 34 13 26 20
ENSMUSG00000030786 Itgam protein_coding 9 36 16 20 39 6 18 450 36 14 36 23
ENSMUSG00000030789 Itgax protein_coding 24 20 8 9 9 10 17 141 20 19 29 24
ENSMUSG00000031443 F7 protein_coding 1 9 4 3 1 2 16 180 4 3 13 7
ENSMUSG00000031827 Cotl1 protein_coding 76 128 118 132 133 101 134 1350 156 175 174 125
ENSMUSG00000032122 Slc37a2 protein_coding 26 44 36 27 20 29 25 244 35 52 36 22
ENSMUSG00000032218 Ccnb2 protein_coding 14 22 24 4 13 17 27 129 15 26 20 15
ENSMUSG00000032322 Pstpip1 protein_coding 8 29 19 15 26 28 30 186 30 36 34 29
ENSMUSG00000032511 Scn5a protein_coding 47 49 51 86 108 55 74 542 87 73 92 87
ENSMUSG00000033220 Rac2 protein_coding 17 16 23 27 13 15 23 186 14 20 33 26
ENSMUSG00000033777 Tlr13 protein_coding 3 6 2 10 11 9 17 217 23 12 8 5
ENSMUSG00000034947 Tmem106a protein_coding 45 41 84 76 80 53 64 391 63 86 97 50
ENSMUSG00000035202 Lars2 protein_coding 1830 4441 2177 1921 1785 1989 7700 1688 1395 1567 1838 1302
ENSMUSG00000035373 Ccl7 protein_coding 3 3 3 14 3 0 5 398 16 17 10 6
ENSMUSG00000035385 Ccl2 protein_coding 10 12 11 43 15 1 16 348 14 12 41 5
ENSMUSG00000036887 C1qa protein_coding 115 166 220 242 260 142 199 1303 256 195 226 180
ENSMUSG00000037280 Galnt6 protein_coding 4 4 4 2 3 3 4 108 7 1 6 3
ENSMUSG00000037649 H2-DMa protein_coding 28 39 31 55 20 24 59 227 37 36 47 49
ENSMUSG00000038147 Cd84 protein_coding 8 16 13 8 16 10 4 408 15 9 21 32
ENSMUSG00000038642 Ctss protein_coding 74 111 107 110 99 50 110 3593 202 87 143 94
ENSMUSG00000040204 Pclaf protein_coding 8 8 8 7 5 5 3 197 15 13 14 6
ENSMUSG00000040552 C3ar1 protein_coding 14 13 32 36 51 3 48 689 41 20 49 22
ENSMUSG00000040747 Cd53 protein_coding 21 24 32 51 38 15 39 373 52 30 38 38
ENSMUSG00000040809 Chil3 protein_coding 1 0 0 0 6 0 0 66 0 6 4 1
ENSMUSG00000040907 Atp1a3 protein_coding 5 7 3 4 5 6 3 148 8 3 13 12
ENSMUSG00000041431 Ccnb1 protein_coding 4 14 2 1 5 0 5 118 5 1 13 8
ENSMUSG00000043091 Tuba1c protein_coding 113 105 81 135 172 142 126 606 125 108 119 107
ENSMUSG00000043157 Arl11 protein_coding 4 4 5 3 11 2 8 114 8 1 3 4
ENSMUSG00000044811 Cd300c2 protein_coding 4 7 4 8 3 3 9 178 12 14 12 11
ENSMUSG00000044827 Tlr1 protein_coding 3 1 3 0 1 0 2 78 6 3 9 2
ENSMUSG00000046805 Mpeg1 protein_coding 29 58 47 47 57 35 59 2548 237 45 87 67
ENSMUSG00000047798 Cd300lf protein_coding 0 3 1 1 2 1 6 126 1 0 2 3
ENSMUSG00000048779 P2ry6 protein_coding 27 15 31 45 52 29 47 274 38 35 55 34
ENSMUSG00000052142 Rasal3 protein_coding 10 5 3 7 6 4 3 54 7 4 4 1
ENSMUSG00000052336 Cx3cr1 protein_coding 12 18 11 10 24 7 27 366 23 8 9 12
ENSMUSG00000055850 Rnf181 protein_coding 41 20 42 23 45 355 48 38 37 35 341 29
ENSMUSG00000056069 Fam105a protein_coding 29 28 28 22 29 26 30 270 37 30 25 31
ENSMUSG00000063415 Cyp26b1 protein_coding 248 1257 198 94 293 99 4027 59 111 54 37 191
ENSMUSG00000068606 Gm4841 protein_coding 3 0 161 1 1 2 0 45 8 265 37 14
ENSMUSG00000069516 Lyz2 protein_coding 408 806 793 912 972 506 672 12094 1278 687 879 561
ENSMUSG00000070780 Rbm47 protein_coding 3 7 5 2 6 1 4 89 7 5 10 5
ENSMUSG00000073412 Lst1 protein_coding 6 2 7 5 5 3 4 57 2 4 4 6
ENSMUSG00000076258 Gm23935 miRNA 2112 8702 2434 2297 1344 2648 16331 1687 1437 1455 2057 1453
ENSMUSG00000076281 Gm24270 miRNA 22 63 36 28 17 30 152 18 12 14 30 20
ENSMUSG00000078122 F630028O10Rik antisense 0 15 11 12 13 8 5 97 8 15 9 9
ENSMUSG00000078880 Gm14308 protein_coding 4 1 106 1 0 0 2 4 4 2 3 0
ENSMUSG00000078899 Gm4631 protein_coding 0 0 0 3 0 3 81 0 1 1 1 1
ENSMUSG00000079227 Ccr5 protein_coding 13 5 2 21 21 5 15 262 16 16 14 5
ENSMUSG00000079419 Ms4a6c protein_coding 3 12 11 8 11 7 9 85 9 6 5 5
ENSMUSG00000095380 Gm23952 miRNA 3 55 7 5 5 7 10 5 6 7 7 8
ENSMUSG00000103349 Gm36888 lincRNA 2 1 14 0 1 3 4 0 0 0 24 0
ENSMUSG00000115230 AC101945.2 lincRNA 0 0 0 0 0 0 2 37 0 1 1 3

size factors

sizeFactors(dds.gastroc)
sizeFactors(dds.soleus)
##      WT_1      WT_3      WT_5      WT_9     WT_11     WT_15      KO_2      KO_4      KO_6      KO_8     KO_10 
## 1.0098829 0.6517416 0.7363933 1.1208488 0.9796707 0.7135590 1.2740994 1.0606543 1.0397324 1.2073234 1.0876389 
##     KO_16 
## 1.6003663 
##      WT_1      WT_3      WT_5      WT_9     WT_11     WT_15      KO_2      KO_4      KO_6      KO_8     KO_10 
## 0.8782302 0.9413594 0.9933758 0.9259800 1.0048036 0.8889040 1.1802624 1.1871833 0.9450776 1.1361948 1.1343119 
##     KO_16 
## 0.9180831

Dispersion Estimates

gastroc

plotDispEsts(dds.gastroc)

soleus

plotDispEsts(dds.soleus)

MA-Plots

gastroc

plotMA(res.gastroc, ylim = c(-6,6))

soleus

plotMA(res.soleus, ylim = c(-6,6))

top significant diff. genes

topGenes.gastroc <- as.data.frame(res.gastroc) %>%
  tibble::rownames_to_column("GeneID") %>%
  top_n(100, wt = -padj) %>%
  arrange(padj)

topGenes.soleus <- as.data.frame(res.soleus) %>%
  tibble::rownames_to_column("GeneID") %>%
  top_n(100, wt = -padj) %>%
  arrange(padj)

knitr::kable(head(topGenes.gastroc), caption = "gastroc")
knitr::kable(head(topGenes.soleus), caption = "soleus")
gastroc
GeneID baseMean log2FoldChange lfcSE stat pvalue padj
ENSMUSG00000050335 4934.2949 6.267396 0.1664065 37.66316 0 0
ENSMUSG00000081249 1378.5460 4.208222 0.1115955 37.70959 0 0
ENSMUSG00000010751 1864.4937 5.610357 0.2014267 27.85309 0 0
ENSMUSG00000032712 5136.5059 4.686418 0.1683204 27.84225 0 0
ENSMUSG00000034855 554.3475 7.276611 0.2735037 26.60517 0 0
ENSMUSG00000037613 960.7558 3.183338 0.1266510 25.13472 0 0
soleus
GeneID baseMean log2FoldChange lfcSE stat pvalue padj
ENSMUSG00000038615 32047.185 -2.5454285 0.0636829 -39.97038 0 0
ENSMUSG00000081249 1651.592 4.5759032 0.1292779 35.39587 0 0
ENSMUSG00000039067 4355.557 -1.1049116 0.0419786 -26.32084 0 0
ENSMUSG00000028932 3190.657 -1.1102258 0.0558536 -19.87741 0 0
ENSMUSG00000006998 7402.617 -0.9196943 0.0468631 -19.62511 0 0
ENSMUSG00000102869 6985.622 -0.9181929 0.0479924 -19.13203 0 0

p-values

# TODO: use ggplot for unified look of plots
hist(res.gastroc$pvalue, main = "gastroc")

hist(res.soleus$pvalue, main = "soleus")

Volcano Plot

volcanoPlot <- function(result, se, pCutoff = 0.01, FCutoff = 1, tissue = character()) {
  gene_names <-
    rowData(se)[rownames(result), c("gene_name"), drop = F]
  results.df <- result %>%
    as.data.frame() %>%
    dplyr::arrange(padj)
  
  # top 10 gene labels for respectively up and down regulation
  labs.up <- results.df[results.df$log2FoldChange > FCutoff, ] %>%
    rownames() %>% .[1:10] %>% gene_names[., c("gene_name")]
  labs.down <- results.df[results.df$log2FoldChange < -FCutoff, ] %>%
    rownames() %>% .[1:10] %>% gene_names[., c("gene_name")]
  selectLab <- c(labs.up, labs.down, "Nfe2l1") %>% unique() # always including "Nfe2l1"
  
  # custom colors:
  keyvals <- ifelse(
    result$log2FoldChange > FCutoff &
      result$padj < pCutoff,
    params$regulated_pal$upregulated,
    ifelse(
      result$log2FoldChange < -FCutoff &
        result$padj < pCutoff,
      params$regulated_pal$downregulated,
      params$regulated_pal$insignificant
    )
  )

  keyvals[is.na(keyvals)] <- params$regulated_pal$insignificant
  names(keyvals)[keyvals == params$regulated_pal$upregulated] <- 'up regulated'
  names(keyvals)[keyvals == params$regulated_pal$insignificant] <- 'nonsignificant'
  names(keyvals)[keyvals == params$regulated_pal$downregulated] <- 'down regulated'
  
  vlc.plt <- EnhancedVolcano(
    result,
    x = 'log2FoldChange',
    y = 'padj',
    title = 'KO vs WT: Nfe2l1 knockout',
    subtitle = ifelse(isEmpty(tissue), "", paste0('tissue: ', tissue)),
    caption = "",
    ylab = expression(paste(-Log[10], p[adj])),
    pCutoff = pCutoff,
    FCcutoff = FCutoff,
    legendPosition = 'right',
    pointSize = 2,
    colCustom = keyvals,
    lab = gene_names$gene_name,
    selectLab = selectLab,
    labSize = 3,
    boxedLabels = TRUE,
    drawConnectors = TRUE,
    max.overlaps = Inf
  )
  
  return(vlc.plt)
}
p <- volcanoPlot(res.gastroc, se.gastroc, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "gastrocnemius")
ggsave(filename = "plots/02_volcano_gastroc.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_volcano_gastroc.png", p)
## Saving 9 x 9 in image
p

p <- volcanoPlot(res.soleus, se.soleus, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "soleus")
ggsave(filename = "plots/02_volcano_soleus.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_volcano_soleus.png", p)
## Saving 9 x 9 in image
p

p <- volcanoPlot(resLFC.gastroc, se.gastroc, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "gastrocnemius")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
ggsave(filename = "plots/02_volcano_gastroc_LFC.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_volcano_gastroc_LFC.png", p)
## Saving 9 x 9 in image
p

p <- volcanoPlot(resLFC.soleus, se.soleus, pCutoff = params$pCutoff, FCutoff = params$FCutoff, tissue = "soleus")
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest non-zero p-value...
ggsave(filename = "plots/02_volcano_soleus_LFC.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_volcano_soleus_LFC.png", p)
## Saving 9 x 9 in image
p

scatter-plot: most differential genes, both tissues

using the Wald-test stat from the DESeq2 result and filtering on the set FCutoff=1 and pCutoff=0.01 yields the following plot:

apply_cutoffs <- function(deseq.result, colname="stat", FCutoff, pCutoff) {
  res.filtered <- deseq.result %>%
    data.frame() %>%
    filter(padj < pCutoff,
           log2FoldChange > FCutoff | log2FoldChange < -FCutoff) %>%
    dplyr::rename(!!colname := stat) %>%
    dplyr::select(!!colname)
  return(res.filtered)
}

gastroc_res.filtered <- apply_cutoffs(res.gastroc, colname="gastroc", params$FCutoff, params$pCutoff)
soleus_res.filtered  <- apply_cutoffs(res.soleus,  colname="soleus",  params$FCutoff, params$pCutoff)

gene_names <- rowData(se.gastroc) %>% as.data.frame() %>% 
  dplyr::select(gene_name)

# combining Wald-Test data from both tissues and ordering in quadrants
res.combined <- merge(gastroc_res.filtered,
                      soleus_res.filtered,
                      by = 0) %>%
  mutate(diff.exp = case_when(
    gastroc < 0 & soleus < 0 ~ "both down",
    gastroc > 0 & soleus > 0 ~ "both up",
    gastroc < 0 & soleus > 0 ~ "gastrocnemius down,\n soleus up",
    gastroc > 0 & soleus < 0 ~ "gastrocnemus up,\n soleus down",
    TRUE ~ "different"
  )) %>% 
  merge(gene_names, by.x="Row.names", by.y=0)
 
# removing all gene_names except the top_n_genes (sum of absolute Wald-test numbers)
top_n_genes <- 10
top_labels <- res.combined %>%
  group_by(diff.exp) %>% 
  arrange(desc(abs(gastroc) + abs(soleus))) %>%
  filter(row_number() %in% c(1:top_n_genes)) %>% 
  ungroup() %>% 
  .$gene_name

res.combined <- res.combined %>% 
  mutate(gene_name = ifelse(gene_name %in% top_labels, gene_name, ""))

# final plot
p <- ggplot(res.combined, aes(x = gastroc, y = soleus, label = gene_name)) +
  geom_vline(xintercept = 0) + 
  geom_hline(yintercept = 0) + 
  geom_point(aes(color = diff.exp)) +
  # scale_color_manual(values = c("red", "chartreuse1", "bisque", "royalblue")) +
  labs(x = "gastrocnemius", y = "soleus", color = "significantly\ndifferentially\nexpressed") +
  ggrepel::geom_label_repel(max.overlaps = 15) + 
  ggtitle(label = "DEA: t-statistics (Wald test)") +
  theme_bw() +
  theme(
    axis.title = element_text(size = 13),
    axis.text = element_text(size = 13),
    title = element_text(size = 13),
    legend.text = element_text(size = 10)
  )

ggsave(filename = "plots/02_dea_scatter.svg", p)
## Saving 9 x 9 in image
ggsave(filename = "plots/02_dea_scatter.png", p)
## Saving 9 x 9 in image
p

barplot

p <- ggplot(res.combined, aes(x = diff.exp)) +
  geom_bar(aes(fill = diff.exp)) +
  theme_bw()
ggsave(filename = "plots/02_dea_scatter_barplot.svg", p)
## Saving 7 x 5 in image
ggsave(filename = "plots/02_dea_scatter_barplot.png", p)
## Saving 7 x 5 in image
p

boxplots: significant genes in both tissues

boxplot.genes <- function(counts.df, title ="", FCutoff = 1) {
  
  ggplot(counts.df,
         aes(
           # x = as.factor(gene_name),
           x = reorder(gene_name,ensembl),
           y = normalized_counts,
           fill = genotype
         )) +
    geom_dotplot(
      binaxis = 'y',
      stackdir = 'center',
      dotsize = 0.3,
      position = position_dodge(0.8),
      fill = "black"
    ) +
    geom_boxplot(outlier.size = 0.3) +
    scale_y_log10() +
    xlab("Genes") +
    ylab("Normalized Counts") +
    scale_fill_manual(values = condition_pal) +
    ggtitle(title) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5)) +
    theme(axis.text.x = element_text(
      angle = 45,
      hjust = 1,
      face = setBold(counts.df$gene_name, c("Nfe2l1"))
    ))
}

prepare_counts <- function(gene_ids, dds, se, reorder_genes = F) {
  counts.top <- counts(dds, normalized = T)[gene_ids, ]
  metadata <- colData(se) %>% as.data.frame()
  gene_names <-
    rowData(se)[, c("gene_name"), drop = F] %>% as.data.frame()
  
  # the prepared counts df for the plot
  counts.plt <-
    data.frame(counts.top) %>%
    tibble::rownames_to_column(var = "ensembl") %>%
    tidyr::gather(key = "samplename",
                  value = "normalized_counts", 2:13) %>%
    merge(metadata, by.x = "samplename", by.y = 0) %>%
    merge(gene_names, by.x = "ensembl", by.y = 0)  %>%
    mutate(genotype = factor(genotype)) %>%
    mutate(genotype = relevel(genotype, "WT")) # "WT" needs to be displayed before "KO"
    
    if (reorder_genes) {
      counts.plt <- counts.plt %>%
        mutate(
          gene_name = forcats::fct_reorder(gene_name, normalized_counts, .desc = T),
          ensembl = forcats::fct_reorder(ensembl, normalized_counts, .desc = T)
        )
    } else {
      idx = match(counts.plt$ensembl, gene_ids)
      counts.plt <- counts.plt[order(idx),] %>%
        mutate(gene_name = factor(gene_name))# %>%
        # arrange(gene_name)
      counts.plt$ensembl <- factor(counts.plt$ensembl, levels = gene_ids,ordered = TRUE)
      counts.plt$gene_name <- factor(counts.plt$gene_name)

      # counts.plt$gene_name <- factor(counts.plt$gene_name, levels = gene_ids)
      # counts.plt <- counts.plt %>% 
      #   mutate(gene_name = factor(gene_name, levels = gene_ids)) %>% 
      #   arrange()
    }
  return(counts.plt)
}


# helper function to set the font of a label on the axis to bold
setBold <- function(src, special_labs) {
  # source: https://stackoverflow.com/questions/39694490/highlighting-individual-axis-labels-in-bold-using-ggplot2
  if (!is.factor(src))
    src <- factor(src)
  src_levels <- base::levels(src)
  brave <- special_labs %in% src_levels
  b_vec <- rep("plain", length(src_levels))
  if (all(brave)) {
    b_pos <- purrr::map_int(special_labs, ~ which(. == src_levels))
    b_vec[b_pos] <- "bold"
    b_vec
  } else {
    message("setBold: no matching element found")
  }
  return(b_vec)
}

# actual plot call
for (group in unique(res.combined$diff.exp)) {
  gene_ids <- filter(res.combined, diff.exp == group)$`Row.names`
  prep_counts.gastroc <- prepare_counts(gene_ids, dds.gastroc, se.gastroc, reorder_genes = T)
  prep_counts.soleus <-
    prepare_counts(levels(prep_counts.gastroc$ensembl), dds.soleus, se.soleus, reorder_genes = T)
  gene_ids <- levels(prep_counts.gastroc$ensembl)
  
  p.g <- boxplot.genes(
    counts.df = prep_counts.gastroc,
    title = paste0("Sign. regulated genes: ", group, "\n tissue: ", "gastrocnemius")
  ) #%>% print()
  
  p.s <- boxplot.genes(
    counts.df = prep_counts.soleus,
    title = paste0("\n tissue: ", "soleus")
  ) #%>% print()
  
  ggpubr::ggarrange(p.g, p.s, ncol = 1, nrow = 2)
}

boxplots: top 20 significant genes

setBold <- function(src, special_labs) {
  # source: https://stackoverflow.com/questions/39694490/highlighting-individual-axis-labels-in-bold-using-ggplot2
  if (!is.factor(src))
    src <- factor(src)
  src_levels <- base::levels(src)
  brave <- special_labs %in% src_levels
  b_vec <- rep("plain", length(src_levels))
  if (all(brave)) {
    b_pos <- purrr::map_int(special_labs, ~ which(. == src_levels))
    b_vec[b_pos] <- "bold"
    b_vec
  } else {
    message("setBold: no matching element found")
  }
  return(b_vec)
}

getTopExpressedEnsemblNames <- function(result,
                                        FCutoff,
                                        n,
                                        up = T) {
  .filter <- ifelse(up, `>`, `<`)
  
  subset(result, .filter(log2FoldChange, FCutoff)) %>%
    data.frame() %>%
    filter(baseMean > 100) %>%
    arrange(padj) %>%
    .[1:n, ] %>%
    rownames()
}

boxplot.top <- function(result, dds, se, upregulated=TRUE, n = 20, FCutoff = 1, tissue ="") {
  # getting the top n regulated genes
  names.top <- getTopExpressedEnsemblNames(result, FCutoff=FCutoff, n=n, up = upregulated)
  
  counts.top <- counts(dds, normalized=T)[names.top, ]
  metadata <- colData(se) %>% as.data.frame()
  gene_names <- rowData(se)[, c("gene_name"), drop = F] %>% as.data.frame()
  
  counts.plt <-
    data.frame(counts.top) %>%
    tibble::rownames_to_column(var = "ensembl") %>%
    tidyr::gather(key = "samplename",
           value = "normalized_counts", 2:13) %>%
    merge(metadata, by.x="samplename", by.y=0) %>%
    merge(gene_names, by.x="ensembl", by.y=0)  %>%
    mutate(genotype = factor(genotype)) %>%
    mutate(genotype = relevel(genotype, "WT"))  %>% # "WT" needs to be displayed before "KO"
    mutate(gene_name = forcats::fct_reorder(gene_name, normalized_counts, .desc = T))
  
  direction <- ifelse(upregulated, "up", "down")
  
  ggplot(counts.plt,
         aes(
           x = as.factor(gene_name),
           y = normalized_counts,
           fill = genotype
         )) +
    geom_dotplot(
      binaxis = 'y',
      stackdir = 'center',
      dotsize = 0.3,
      position = position_dodge(0.8),
      fill = "black"
    ) +
    geom_boxplot(outlier.size = 0.3) +
    scale_y_log10() +
    xlab("Genes") +
    ylab("Normalized Counts") +
    scale_fill_manual(values = condition_pal) +
    ggtitle(paste0("Top ", n, " ", direction, "-regulated Genes\n tissue: ", tissue)) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5)) +
    theme(axis.text.x = element_text(
      angle = 45,
      hjust = 1,
      face = setBold(counts.plt$gene_name, c("Nfe2l1"))
    ))
}

gastroc

boxplot.top(res.gastroc, dds.gastroc, se.gastroc, upregulated = T, tissue = "gastrocnemius")

boxplot.top(res.gastroc, dds.gastroc, se.gastroc, upregulated = F, tissue = "gastrocnemius")

soleus

boxplot.top(res.soleus, dds.soleus, se.soleus, upregulated = T, tissue = "soleus")

boxplot.top(res.soleus, dds.soleus, se.soleus, upregulated = F, tissue = "soleus")

Venn/Euler-Diagram

Here are some Venn or Euler (proportional Venn) diagrams to choose from.

Note: using Eulerr diagrams, yields inconsistent plots where the circle sets will be placed at different absolute positions each time the plot function is called. => Thus the plot might need to be called multiple times until the wanted constellation is obtained.

significant overview with all genes:

venn.colors <- c(genes = "white", params$tissue_pal)

gene_sets <- c(
  "all genes" = sign_gene_stats$`all genes` - sign_gene_stats$shared_sig_genes,
  "all genes&gastroc" = sign_gene_stats$gastroc - sign_gene_stats$shared_sig_genes,
  "all genes&soleus" = sign_gene_stats$soleus - sign_gene_stats$shared_sig_genes,
  "all genes&gastroc&soleus" = sign_gene_stats$shared_sig_genes
)

# 1st option: euler
plot(euler(gene_sets),
     legend = list(side = "right") ,
     main = "significant genes per tissue",
     fills = venn.colors)

# 2nd option: venn
plot(venn(gene_sets),
     main = "significant genes per tissue",
     fills = venn.colors)

significant grouped:

col_palette <- RColorBrewer::brewer.pal(8, "Dark2")
# venn.colors <- c(gastroc = "#FFB08E", soleus = "#FF6638", col_palette[1:4])
venn.colors <- c(params$tissue_pal, scales::hue_pal()(4))

# sets for the venn/euler diagram (exclusive counts)
gene_sets <- c(
  "gastrocnemius" = sign_gene_stats$gastroc - sign_gene_stats$shared_sig_genes,
  "soleus" = sign_gene_stats$soleus - sign_gene_stats$shared_sig_genes,
  "gastroc&soleus" = sign_gene_stats$shared_sig_genes,
  "gastroc&soleus&both down" = sign_gene_stats$`both down`,
  "gastroc&soleus&both up" = sign_gene_stats$`both up`,
  "gastroc&soleus&ga up, sol down" = sign_gene_stats$`ga up, sol down`,
  "gastroc&soleus&ga down, sol up" = sign_gene_stats$`ga down, sol up`
)
# 'gastroc&soleus': if this line is omitted, then all intersects will be denoted 
# with 0 (which was the goal to omit)

# 1st option: eulerr
plot(
  euler(gene_sets),
  quantities = T,
  legend = list(side = "right"),
  fills = venn.colors,
  main = "significant genes"
)
## Warning in colSums(id & !empty) == 0 | merged_sets: longer object length is not a multiple of shorter object length

# only the two tissues
gene_sets <- c(
  "gastrocnemius" = sign_gene_stats$gastroc - sign_gene_stats$shared_sig_genes,
  "soleus" = sign_gene_stats$soleus - sign_gene_stats$shared_sig_genes,
  "gastroc&soleus" = sign_gene_stats$shared_sig_genes
)


# 2nd option: euler two tissues
p <- plot(
  euler(gene_sets),
  quantities = T,
  legend = list(side = "right"),
  fills = venn.colors,
  main = "significant genes"
)
ggsave("./plots/03_euler.svg", p)
## Saving 7 x 5 in image
ggsave("./plots/03_euler.png", p)
## Saving 7 x 5 in image
p

# 2nd option: venn two tissues
library(eulerr)
p <- plot(
  eulerr::venn(gene_sets),
  fills = params$tissue_pal,
  main = "significant genes"
)
ggsave("./plots/03_venn.svg", p)
## Saving 7 x 5 in image
ggsave("./plots/03_venn.png", p)
## Saving 7 x 5 in image
p

Heatmap

Replacing the raw counts with normalized counts for better visual representation of the performed DEA:

counts.gastroc <- counts(dds.gastroc, normalized = T)
counts.soleus <- counts(dds.soleus, normalized = T)
zscore <- function(M) {
  s <- apply( M,1,sd )      # standard deviation
  µ <- apply( M, 1, mean )  # mean
  M.z <- (M - µ) / s        # zscore
  return(M.z)
}

# obtaining all DE genes, but avoiding duplicates
sign_genes <-
  unique(c(
    row.names(gastroc_res.filtered),
    row.names(soleus_res.filtered)
  )) 
# making sure, that DE genes occur in both tissues
sign_genes <-
  sign_genes[sign_genes %in% rownames(counts.gastroc) &
               sign_genes %in% rownames(counts.soleus)]

counts_sign <- merge(
  counts.gastroc[sign_genes,],
  counts.soleus[sign_genes,],
  by = 0,
  suffixes = c("_gastrocnemius", "_soleus")
) %>% 
  tibble::column_to_rownames(var="Row.names") %>%
  as.matrix()
plotCHM <- function(counts.df) {
  # http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3
  morecols <- colorRampPalette(params$heatmap_pal)
  
  tissue_vec <- c(rep("gastroc", 12), rep("soleus", 12))
  condition_vec <- c(rep(c(rep("WT", 6), rep("KO", 6)),2))
  top_annot <-
    HeatmapAnnotation(
      tissue = tissue_vec,
      condition = condition_vec,
      col = list(tissue = params$tissue_pal, condition = params$condition_pal),
      gp = gpar(col = "darkgray"),
      show_legend = FALSE
    )
  
  ## sample numbers
  sample_nrs <- colnames(counts.df) %>% 
    stringr::str_extract("(?<=_)\\d{1,2}(?=_)")
  colnames(counts.df) = sample_nrs
  
  chm <- Heatmap(
    counts.df,
    row_title = "differentially expressed genes",
    name = "zscore",
    show_row_names = FALSE,
    show_column_names = TRUE,
    column_title = "samples",
    col = morecols(50),
    column_title_side = "bottom",
    top_annotation = top_annot,
    column_names_gp = gpar(col = rep(unname(params$tissue_pal), each=12))
  )
  
  # creating custom annotation legend (to obtain the gray border)
  condition_legend = Legend(
    labels = c("WT", "KO"),
    legend_gp = gpar(fill = params$condition_pal),
    border = "darkgray",
    title = "condition"
  )
  tissue_legend = Legend(
    labels = c("gastrocnemius", "soleus"),
    legend_gp = gpar(fill = unname(params$tissue_pal)),
    border = "darkgray",
    title = "tissue"
  )
  legend_list <- list(condition_legend, tissue_legend)
  
  draw(chm, annotation_legend_list = legend_list)
}

chm <- plotCHM(zscore(counts_sign))

svglite::svglite("./plots/03_heatmap_sign_genes.svg", width=6, height=6)
chm
dev.off()
png("./plots/03_heatmap_sign_genes.png", width=6, height=6, units = "in", res = 1200)
chm
dev.off()
## quartz_off_screen 
##                 2 
## quartz_off_screen 
##                 2

pca: significant genes

PCA is applied to the same significantly DE genes (normalized) as used in the heatmap.

pca <- prcomp(t(counts_sign), scale. = T)

pca.data <- data.frame(Sample = rownames(pca$x),
                     X = pca$x[, 1],
                     Y = pca$x[, 2]) %>%
mutate(condition = substr(Sample, 1, 2),
       tissue = stringr::str_extract(Sample,"[:alpha:]+$"),
       sample_nr = stringr::str_extract(Sample, "(?<=_)\\d{1,2}(?=_)"))

p <-
autoplot(
  pca,
  data = pca.data,
  colour = 'tissue',
  shape = 'condition',
  label.label = "sample_nr",
  label = TRUE,
  label.show.legend = FALSE,
  label.repel = TRUE
) +
  # ggtitle("PCA significantly DE genes") +
  scale_color_manual(values = unname(params$tissue_pal)) +
  theme_bw()
ggsave(filename = "plots/02_pca_dea_sign.svg", p)
## Saving 7 x 5 in image
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggsave(filename = "plots/02_pca_dea_sign.png", p)
## Saving 7 x 5 in image
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps
p
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider increasing max.overlaps

save R ojects

save(dds.gastroc, dds.soleus, res.gastroc, res.soleus, file = "./data/Robjects/03_DDS.RData")